Optimization for Linear Regression

Today we're going to look at how to find the best linear model.

We've seen a few different ways to frame this in the last lecture, but the most "machine learning-y" framing is:

  1. We fix our model class to be linear models, i.e. we have a functional model where we are trying to predict $y$ from (in the simplest case) a single feature $x$, $$ y = f(x),$$ and we assume that $$ f(x) = w_0 + w_1x $$

  2. We fix our loss function to be the squared error loss function. We've written this as $\mathcal{L}$ previously, but you will also sometimes (often) see the loss function denoted as $J$. In other words, $$ J(w_0,w_1) = \mathcal{L}(f_{w_0, w_1}) = \frac{1}{2}\sum_{i=1}^N(y_i - (w_0 + w_1x_i))^2 $$

(Note: the $\frac{1}{2}$ is added for mathematical convenience. Exercise: if we're trying to find the best values of $w_0$ and $w_1$, why doesn't it matter whether that factor is there or not?)
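To make this concrete, here's a minimal sketch of $J$ as a Python function (the function name `squared_error_loss` and the toy arrays are mine, purely for illustration):

```python
import numpy as np

def squared_error_loss(w0, w1, x, y):
    """J(w0, w1) = 1/2 * sum_i (y_i - (w0 + w1 * x_i))^2"""
    residuals = y - (w0 + w1 * x)
    return 0.5 * np.sum(residuals ** 2)

# Hypothetical toy data, just to show the call:
x = np.array([1.0, 2.0, 3.0])
y = np.array([2.1, 3.9, 6.2])
print(squared_error_loss(0.0, 2.0, x, y))  # loss for the line y = 2x
```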

Then, we need to figure out how to find the best hypothesis/model in our hypothesis/model class! We're going to look at three ways to do that today:

  1. Closed form
  2. Gradient Descent
  3. Stochastic Gradient Descent

Setup

Data

Throughout, we'll use a few different datasets (and don't worry, we'll get back to the real-world one!), but the primary one is displayed below:
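(The plot itself isn't reproduced in these notes; as a stand-in, here is one hypothetical way to generate data with a roughly linear trend. The coefficients 1.5 and 0.8 and the noise level are invented, not the course dataset.)

```python
import numpy as np
import matplotlib.pyplot as plt

rng = np.random.default_rng(0)
x = rng.uniform(0, 10, size=50)                  # 50 feature values
y = 1.5 + 0.8 * x + rng.normal(0, 1.0, size=50)  # a line plus Gaussian noise

plt.scatter(x, y)
plt.xlabel("x")
plt.ylabel("y")
plt.show()
```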

Big picture: What are we trying to do?

Before we jump into how to optimize, it's worth thinking through what we're actually trying to do. We are trying to find the values of $w_0$ and $w_1$ that minimize the loss $J(w_0, w_1)$: that is, the line that fits our data best under squared error.

One important note: this loss function is convex, and so it has a single, unique, global minimum. Here's a non-convex function:

Some of what we're doing here relies on the convexity of the loss function for a linear model with squared error loss; that is, on the fact that the Ordinary Least Squares (OLS) objective is convex.

Optimization approach 1: Closed Form

First, some notes:

Exercise (don't peek): how are the points above related to finding the best possible values of $w_0$ and $w_1$?

How to do it - $w_0$ and $w_1$ only

A reminder of our loss function $$J(w_0,w_1) = \frac{1}{2}\sum_{i=1}^N(y_i - (w_0 + w_1x_i))^2$$

Now, we want to minimize the loss. This is an optimization problem:

$$ \mathop{\mathrm{argmin}}_{w_0, w_1} \frac{1}{2}\sum_{i=1}^N(y_i - (w_0 + w_1x_i))^2 $$

We can try to solve this optimization problem by taking the partial derivative of our loss function with respect to both $w_1$ and $w_0$ and setting each of those equal to zero. We can then solve for the respective values; because our function is convex, these will be the values that reach the global minimum of the loss function.

I'm not going to derive this in class, and you are not responsible for knowing how to derive this for the case of only $w_0$ and $w_1$, since this is a special case. But for those interested, there's a nice, detailed derivation here.

The resulting equations, though (where $\bar{x}$ is the sample mean of $x$ and $\bar{y}$ is the sample mean of $y$), are:

$$w_1 = \frac{ \sum_{i=1}^N (x_i - \bar{x})(y_i - \bar{y}) }{ \sum_{i=1}^N (x_i - \bar{x})^2 } $$

Note that this can be derived differently, leading to $\frac{\mathrm{Cov}(x,y)}{\mathrm{Var}(x)}$ ... which is the same thing! (See Shalizi, Section 1.4 if you're curious; you are not responsible for knowing this, though.)

$$ w_0 = \bar{y} - w_1\bar{x} $$
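Translated directly into numpy, a sketch might look like this (the toy data is hypothetical):

```python
import numpy as np

def fit_simple_ols(x, y):
    """Closed-form OLS for y = w0 + w1 * x."""
    x_bar, y_bar = x.mean(), y.mean()
    w1 = np.sum((x - x_bar) * (y - y_bar)) / np.sum((x - x_bar) ** 2)
    w0 = y_bar - w1 * x_bar
    return w0, w1

# Hypothetical data with a roughly linear trend:
rng = np.random.default_rng(0)
x = rng.uniform(0, 10, 50)
y = 1.5 + 0.8 * x + rng.normal(0, 1, 50)
w0, w1 = fit_simple_ols(x, y)
print(w0, w1)  # should land close to 1.5 and 0.8
```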

Moving to the more general case

In class, we will discuss how if we 1) move $w_0$ into the feature set with a little trick (adding a constant feature that is always $1$), and 2) look at many features instead of one, it becomes cleaner (although equivalent) to write the loss function in vector/matrix notation, as: $$ J({\bf w}) = \frac{1}{2}({\bf y} - {\bf Xw})^\top({\bf y} - {\bf Xw}) $$

If we take the derivative of this loss function and set it equal to zero, we can derive that the loss function is minimized when: $$ {\bf w} = ({\bf X}^\top{\bf X})^{-1}{\bf X}^\top{\bf y} $$

I will show this derivation in class, largely following Dr. Chandola's derivation in his scans.

Now, let's try to implement this in code below!
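Here is one minimal sketch (I use `np.linalg.solve` on the normal equations rather than forming the inverse explicitly, which is the numerically safer habit; the toy data is hypothetical):

```python
import numpy as np

def fit_ols_closed_form(X, y):
    """Solve (X^T X) w = X^T y rather than inverting X^T X explicitly."""
    return np.linalg.solve(X.T @ X, X.T @ y)

# Hypothetical data; the column of ones is the "little trick" that absorbs w0.
rng = np.random.default_rng(0)
x = rng.uniform(0, 10, 50)
y = 1.5 + 0.8 * x + rng.normal(0, 1, 50)
X = np.column_stack([np.ones_like(x), x])
print(fit_ols_closed_form(X, y))  # approximately [1.5, 0.8]
```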

Closed form is great, but perhaps surprisingly (it's closed form, after all!), it can actually be extremely expensive for large datasets (exercise: why?).

Also, closed-form solutions only exist for some of the optimization problems we care about in this course. As such, it is useful to now introduce two more general approaches to optimizing model parameters that we will see over and over again.

Method 2: Gradient Descent

An intro to gradient descent in general

These notes are drawn from Dr. Chandola's notes on linear regression.

Gradient descent, also known as steepest descent, is an optimization algorithm for finding a local minimum of a function. To find a local minimum, the algorithm repeatedly "steps" in the direction of the negative of the gradient. Gradient ascent is the same as gradient descent, except that it steps in the direction of the positive of the gradient and therefore finds local maxima instead of minima. The algorithm, where $\eta > 0$ is the step size (learning rate), can be outlined as follows:

    1:   Choose initial guess ${\bf w}^{(0)}$
    2:   for k = 0, 1, 2, ... do
    3:       $s_k = -\nabla f({\bf w}^{(k)})$
    4:       ${\bf w}^{(k+1)} = {\bf w}^{(k)} + \eta s_k$
    5:   end for

As a simple example, let's find a local minimum for the function $f(x) = x^3-2x^2+2$
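A minimal sketch (the starting point and step size are my choices; since $f'(x) = 3x^2 - 4x$, the local minimum we expect to find is at $x = 4/3$):

```python
def f(x):
    return x**3 - 2 * x**2 + 2

def grad_f(x):
    return 3 * x**2 - 4 * x   # f'(x)

x = 2.0      # hypothetical starting guess
eta = 0.1    # hypothetical step size
for _ in range(100):
    x = x - eta * grad_f(x)
print(x, f(x))   # x approaches 4/3, the local minimum
```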

Consider another function: $$ f(x,y) = 5\cos(x - 2) + 5\sin(x - 2y) $$ The gradient of this function is: $$ \nabla f = \left[\begin{array}{c}-5\sin(x-2) + 5\cos(x - 2y) \\ -10\cos(x-2y) \end{array}\right] $$
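A quick sketch of gradient descent using this gradient (starting point, step size, and iteration count are arbitrary choices):

```python
import numpy as np

def grad(w):
    x, y = w
    return np.array([
        -5 * np.sin(x - 2) + 5 * np.cos(x - 2 * y),   # df/dx
        -10 * np.cos(x - 2 * y),                      # df/dy
    ])

w = np.array([0.0, 0.0])   # hypothetical starting point
eta = 0.05                 # hypothetical step size
for _ in range(1000):
    w = w - eta * grad(w)
print(w)   # a stationary point; which one depends on where we start
```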

Linear Regression with Gradient Descent

We are optimizing the linear regression loss function $J({\bf w}) = \frac{1}{2}({\bf y} - {\bf Xw})^\top({\bf y} - {\bf Xw})$ from above; its gradient is $\nabla J({\bf w}) = -{\bf X}^\top({\bf y} - {\bf Xw})$, so each gradient descent step nudges ${\bf w}$ in the direction ${\bf X}^\top({\bf y} - {\bf Xw})$.
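Putting the pieces together, a sketch of gradient descent for linear regression might look like this (step size and iteration count are hand-picked for this hypothetical data, not universal defaults):

```python
import numpy as np

def gd_linear_regression(X, y, eta=5e-4, n_iters=20_000):
    """Gradient descent on J(w) = 1/2 (y - Xw)^T (y - Xw)."""
    w = np.zeros(X.shape[1])
    for _ in range(n_iters):
        grad = -X.T @ (y - X @ w)   # gradient of the squared-error loss
        w = w - eta * grad
    return w

# Hypothetical toy data; the ones column absorbs the intercept w0.
rng = np.random.default_rng(0)
x = rng.uniform(0, 10, 50)
X = np.column_stack([np.ones_like(x), x])
y = 1.5 + 0.8 * x + rng.normal(0, 1, 50)
print(gd_linear_regression(X, y))   # should match the closed-form solution
```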

Method 3: Stochastic Gradient Descent

The one issue with gradient descent ... it still uses all the data! That is, at every step we have to compute the loss and the gradient of the loss on all the data. This can be particularly problematic when the dataset is large, especially if it resides on different machines (e.g. in a Hadoop cluster).

It would be nice to instead use only a subset of the data, and still be sure that we get the optimal answer. Turns out, we can!

The stochastic gradient descent algorithm allows us to use single data points from our dataset to compute gradients and update our weights in the gradient descent algorithm. It is, in my humble opinion, actually kind of crazy that this works! I will not delve into the mathematics of why it works, although you can see here for a nice explanation. The general idea is that a uniformly sampled point gives an unbiased estimate of the full gradient, and you can prove (with some effort) that following these noisy gradient estimates still converges (with a few caveats).

You are not responsible for understanding that math for this course, but you will, later on, be responsible for understanding how to implement stochastic gradient descent. More on that in future programming assignments :).
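To make the idea concrete anyway, here is a minimal sketch (not the assignment's required implementation): each weight update uses the gradient of the loss on a single randomly chosen example.

```python
import numpy as np

def sgd_linear_regression(X, y, eta=1e-3, n_epochs=200, seed=0):
    """SGD: each update uses the gradient on one randomly chosen example."""
    rng = np.random.default_rng(seed)
    w = np.zeros(X.shape[1])
    for _ in range(n_epochs):
        for i in rng.permutation(len(y)):        # shuffle every epoch
            grad_i = -(y[i] - X[i] @ w) * X[i]   # single-example gradient
            w = w - eta * grad_i
    return w

# Same hypothetical toy data as before:
rng = np.random.default_rng(0)
x = rng.uniform(0, 10, 50)
X = np.column_stack([np.ones_like(x), x])
y = 1.5 + 0.8 * x + rng.normal(0, 1, 50)
print(sgd_linear_regression(X, y))  # hovers near the closed-form answer;
                                    # a decaying step size would tighten it
```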

Two Miscellaneous Things

Numerical instability

$X^\top X$

If your data has redundancy (repeated or nearly collinear features), the inverse of the $X^\top X$ matrix will be unstable. However, if you use the sklearn LinearRegression implementation, you will still see a valid solution (see below).
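Here is a sketch of that experiment (the data is hypothetical; the repeated column is what makes $X^\top X$ singular):

```python
import numpy as np
from sklearn.linear_model import LinearRegression

rng = np.random.default_rng(0)
x = rng.uniform(0, 10, 50)
X = np.column_stack([np.ones_like(x), x, x])   # repeated column: X^T X is singular
y = 1.5 + 0.8 * x + rng.normal(0, 1, 50)

print(np.linalg.cond(X.T @ X))   # enormous condition number
# np.linalg.inv(X.T @ X) will raise LinAlgError or return garbage values

reg = LinearRegression(fit_intercept=False).fit(X, y)
print(reg.coef_)   # still a valid (though non-unique) least-squares solution
```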

Why?

Looking ahead: The problem with outliers

Impact of outliers

OLS is susceptible to outliers because of the squared term in the loss function. For Bayesian regression, the issue arises because of the squared term in the exponent of the Gaussian pdf. See below for alternate distributions.

Exercise (don't peek!): What would you do?

A solution: use a different model class!

Here, we could use a model called robust regression. The statsmodels package has a robust linear regression model function (rlm). We won't delve into the details; this is just a note foreshadowing that we can address problems by changing the model class.
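For a taste, a minimal sketch with statsmodels (the data and planted outliers are invented; `HuberT` is one of several robust norms `RLM` accepts):

```python
import numpy as np
import statsmodels.api as sm

rng = np.random.default_rng(0)
x = rng.uniform(0, 10, 50)
y = 1.5 + 0.8 * x + rng.normal(0, 1, 50)
y[:3] = 40.0                      # plant a few gross outliers

X = sm.add_constant(x)
ols_fit = sm.OLS(y, X).fit()
rlm_fit = sm.RLM(y, X, M=sm.robust.norms.HuberT()).fit()
print(ols_fit.params)  # dragged toward the outliers
print(rlm_fit.params)  # much closer to the true (1.5, 0.8)
```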

Wrapping up

We now know what linear regression is (as a model/hypothesis class), and how to optimize it in three different ways.

There are two things that remain:

  1. How do we evaluate our model?
  2. How do we understand the coefficients in the model?

To address these, we'll return to our real-world example.